/******************************************************************************
Author: Akshay Dixit
Date created: February 21, 2019
Date last modified: June 14, 2020
File name: 1_tz.do
Project: T4D Tanzania 
Purpose: Baseline data on primary & secondary outcomes
******************************************************************************/

/*

	This .do file prepares a village-level dataset of average outcomes at baseline, for
	primary and secondary outcomes (see Pre-Analysis Plan for how these outcomes are defined).

	The purpose of preparing these average outcomes is to include Baseline village-level averages in 
	the final regression analyses, as a robustness check.

	This do file calculates average baseline values for the following outcomes:
	
		Primary
			- First ANC visit within the first trimester (both skilled and unskilled providers included)
			- Four or more ANC visits (with a skilled provider)
			- Delivery with a skilled birth attendant
			- Delivery at a health facility
			- Empowerment: Participation index (outcome components different at baseline/endline)
			- Empowerment: Vignettes (based on a self-efficacy question at baseline, no vignette)
			- Postnatal care (mother & baby)
			- Weight-for-age
			- Height-for-age
			- Content of Care index (Outcome components different at baseline/endline)
		
		Secondary
			- Birth weight
			- Birth preparedness
			- Maternal depression
			
	NOTE: The data on primary outcomes from the Tanzania baseline survey are 
	split into two .dta files - bl_HHData.dta and bl_HHBirth.dta.
	Accordingly, this .do file is organized by .dta file. First, outcomes in bl_HHData
	are prepared, followed by outcomes in bl_HHBirth.
	
	In addition, the .do file also computes measures of baseline health facility quality, to be used
	in the subgroup analysis.
	This includes:
			1. Facility has specific room for deliveries
			2. Delivery room has some form of auditory and/or visual privacy
			3. Delivery room bed is clean
			4. Delivery room floor is clean
			5. Delivery room is well-ventilated or air-conditioned
			6. Biological/medical waste disposal available
			7. No dust/mold observed in delivery room
			8. Separate bin for placenta disposal
			9. Water is available to flush in toilet
			10. Water is available to wash hands in toilet
			11. Soap is available in toilet
			12. MNH staff present 24/7
			
*/


* Start from a clean session and work inside the directory stored in the
* global macro "data" (it must be defined before this file is run).
clear all
set more off
cd "${data}"

********************************************************************************
* OUTCOMES IN THE HHData DATASET
********************************************************************************

* Load the baseline household file, restrict it to sampled villages and attach
* the randomization strata.
* NOTE(review): the treatment and strata_var variables used further down are
* presumably supplied by these two merged files - confirm.

use "bl_HHData.dta", clear

merge m:1 VillageID using "sample_villages.dta"
keep if _merge != 1	//Baseline dataset has 202 villages, drop the 2 not in the sample
drop _merge

rename VillageID village_id
merge m:1 village_id using "randomization_strata.dta"
drop _merge

* One 0/1 indicator per randomization stratum (strata_var takes values 1 to 8)
forvalues s = 1/8 {
	generate strata`s' = (strata_var == `s')
}

********************************************************************************

* Delivery with a skilled birth attendant
* Equals 1 when hh4assistdel1 is 1 or hh4assistdel2 is 2, and 0 otherwise.
* Left missing when the string variable hh4assistdel is empty.

generate bl_skilled_provider_birth = (hh4assistdel1 == 1 | hh4assistdel2 == 2) if hh4assistdel != ""
tabulate bl_skilled_provider_birth

********************************************************************************

* Empowerment - self-efficacy
* Copy of hh7decision in which the code -97 is set to missing.

generate bl_ks11 = hh7decision if hh7decision != -97
tabulate bl_ks11

********************************************************************************	

* Empowerment - participation
*
* Summary index built from seven participation items, in this order:
*   1. Set the code -97 to missing and create a 0/1 dummy per item
*      (1 = item equals 1; the dummy is missing when the item is missing).
*      The dummies are already oriented so that 1 is the "better" outcome.
*   2. Impute missing dummies at the mean of the observation's treatment group.
*   3. Standardize each dummy with the Control-group mean and s.d.
*      Because step 2 runs first, those moments are computed on imputed data.
*   4. Average the seven z-scores into bl_participation.

local participation hh7meeting hh7metpolitician hh7protest hh7infocampaign hh7alertnews hh7notifypolice hh7petitiongovt

* Step 1: recode and create the dummies, collecting their names as we go
local index_participation
foreach item of local participation {
	tabulate `item'
	replace `item' = . if `item' == -97
	generate `item'_dummy = (`item' == 1) if `item' != .
	local index_participation `index_participation' `item'_dummy
}

	* Step 2: impute missing values at the treatment assignment group mean
	foreach dummy of local index_participation {
		bysort treatment: egen mean_`dummy' = mean(`dummy')
		replace `dummy' = mean_`dummy' if `dummy' == .
	}
	drop mean_* 

	* Step 3: standardize with the mean and s.d. of the control group.
	* The helper variables were dropped above, so the names below refer to scalars.
	local z_participation
	foreach dummy of local index_participation {
		summarize `dummy' if treatment == "Control"
		scalar mean_`dummy' = r(mean)
		scalar sd_`dummy' = r(sd)
		generate z_`dummy' = (`dummy' - mean_`dummy')/sd_`dummy'
		local z_participation `z_participation' z_`dummy'
	}

	* Step 4: compile the summary index as the row mean of the z-scores
	egen bl_participation = rowmean(`z_participation')
	summarize bl_participation

********************************************************************************	

* Birth preparedness
* Each of hh6delprep1-hh6delprep7 becomes a 0/1 flag (1 = the column is
* non-missing). Flags are left missing when the string hh6delprep is empty.
* bl_birth_preparedness is the row total of the seven flags and is missing
* only when all seven flags are missing.
local prep_flags
forvalues k = 1/7 {
	generate hh6delprep`k'_binary = (hh6delprep`k' != .) if hh6delprep != ""
	tabulate hh6delprep`k'_binary
	local prep_flags `prep_flags' hh6delprep`k'_binary
}
egen bl_birth_preparedness = rowtotal(`prep_flags'), missing

********************************************************************************

* Maternal depression (six K6 items)
local k6_items hh8nervous hh8hopeless hh8fidgety hh8depressed hh8effort hh8worthless
describe `k6_items'

	//The items are shifted from a 1-5 to a 0-4 scale. The value label L_AllNone
	//is dropped first, presumably so the old labels do not attach to the shifted codes.
label drop L_AllNone

foreach item of local k6_items {
	replace `item' = . if inlist(`item', -99, -97)
	replace `item' = `item' - 1 if `item' != .
	tabulate `item'
}

* Total score over the six items, set to missing when any single item is missing
egen bl_mother_k6_score = rowtotal(`k6_items'), missing
foreach item of local k6_items {
	replace bl_mother_k6_score = . if `item' == .
}
tabulate bl_mother_k6_score	//Original author's note: the higher the K6 score, the lower the level of depression

	/*
	
	Original author's note: the Harvard Med School doesn't provide any specific
	guidance on how to treat missing values.
	Here, if any of the 6 components is missing, the total K6 score is coded as missing.
	This approach is conservative; the author recorded 149 missing values with it.
	
	*/
	
********************************************************************************

* First ANC visit within the first trimester
* Negative codes in hh4ancmth are set to missing. The indicator is 1 when
* hh4ancmth is 3 or less, and is missing when hh4ancmth is missing.
tabulate hh4ancmth
replace hh4ancmth = . if hh4ancmth < 0
generate bl_anc_first_trimester = (hh4ancmth <= 3) if hh4ancmth != .

********************************************************************************
	
* Four or more ANC visits (with a skilled provider)
* The indicator is 1 when hh4nberanc is 4 or more and 0 when it is between 0 and 3.
* It is left missing when hh4nberanc is:
*   - any missing value, including extended missing values (.a-.z), which
*     compare as larger than every number and would otherwise be coded as 1;
*   - negative, since negative values are treated as non-response codes in the
*     other survey variables of this file (e.g. hh4ancmth) and would otherwise
*     be counted as "fewer than four visits".
tab hh4nberanc
g bl_number_anc_visits_binary = (hh4nberanc >= 4) if hh4nberanc >= 0 & !missing(hh4nberanc)
tab bl_number_anc_visits_binary

********************************************************************************
	
* Village level averages

* For every household-level outcome, store its village mean in m_<outcome>.
foreach outcome in bl_anc_first_trimester bl_birth_preparedness bl_mother_k6_score bl_participation bl_skilled_provider_birth bl_ks11 bl_number_anc_visits_binary {
	bysort village_id: egen m_`outcome' = mean(`outcome')
}

* The means are constant within a village, so contract leaves one row per
* village; the frequency counter it creates is not needed.
contract village_id m_bl_number_anc_visits_binary m_bl_anc_first_trimester m_bl_birth_preparedness ///
	m_bl_mother_k6_score m_bl_participation m_bl_skilled_provider_birth m_bl_ks11
drop _freq

label variable m_bl_anc_first_trimester "Baseline village-level average: ANC within the first trimester"
label variable m_bl_number_anc_visits_binary "Baseline village-level average: 4 or more ANC visits"
label variable m_bl_skilled_provider_birth "Baseline village-level average: Birth with a skilled provider" 
label variable m_bl_birth_preparedness "Baseline village-level average: Birth preparedness"
label variable m_bl_mother_k6_score "Baseline village-level average: Maternal depression"
label variable m_bl_participation "Baseline Empowerment - Participation"
label variable m_bl_ks11 "Baseline Empowerment - Self Efficacy"

save "baseline_outcomes_by_village.dta", replace

********************************************************************************
* OUTCOMES IN HHBirth
********************************************************************************

use "bl_HHBirth.dta", clear

* Restrict the birth-level file to sampled villages and attach the strata.
* VillageID 999903 is first recoded to 1419004.
* NOTE(review): the HHData section of this file applies no such recode -
* confirm whether bl_HHData.dta already carries the corrected ID.

replace VillageID = 1419004 if VillageID == 999903

merge m:1 VillageID using "sample_villages.dta"
keep if _merge != 1	//Baseline dataset has 202 villages, drop the 2 not in the sample
drop _merge

rename VillageID village_id
merge m:1 village_id using "randomization_strata.dta"
drop _merge

* One 0/1 indicator per randomization stratum (strata_var takes values 1 to 8)
forvalues s = 1/8 {
	generate strata`s' = (strata_var == `s')
}

********************************************************************************

* Keep only most recent births

* String versions of the ID components, used to build concatenated IDs
tostring village_id, g(VillageID_str)
tostring HHID, g(HHID_str)
tostring BirthID, g(BirthID_str)

* Births before 2014 cannot be the most recent ones (see the note below)
drop if hh3birthyr < 2014

egen babyID = concat(VillageID_str HHID_str BirthID_str)
egen household_ID = concat(VillageID_str HHID_str)
duplicates tag household_ID, gen(dup_hhid)	

* A 2014 birth is dropped only when the same household also reports a later
* birth. The previous rule dropped every 2014 birth in a household with more
* than one remaining birth, so a household whose remaining births were all in
* 2014 lost all of them. egen with by() is used so that the sort order of the
* data is left untouched.
egen latest_birthyr = max(hh3birthyr), by(household_ID)
drop if hh3birthyr == 2014 & latest_birthyr > 2014
drop latest_birthyr

	/*
	
	In Tanzania, data collection took place between March to July 2015. 
	Hence, any births in 2010-13 were clearly not the most recent births. 
	These are dropped from the analysis.
	Then, among the households that listed multiple births, births in 2014 are
	dropped when the household also has a birth in 2015. Births in 2015 are kept,
	as are 2014 births in households without a later birth.
	
	The original author recorded 3001 remaining births under the earlier rule,
	which dropped all 2014 births in multi-birth households; re-check this count.
	
	*/

********************************************************************************

* Birth at a facility
* Indicator equal to 1 when hh3deliveryloc is 1, and missing when
* hh3deliveryloc is missing.
tabulate hh3deliveryloc
generate bl_facility_birth = (hh3deliveryloc == 1) if hh3deliveryloc != .

********************************************************************************

* Birth weight
* Negative codes in both weight sources are set to missing first.
foreach source of varlist hh4weightcard hh4weightrec {
	replace `source' = . if `source' < 0
}

* hh4weightcard is used when available, otherwise hh4weightrec
generate birthweight = cond(hh4weightcard == ., hh4weightrec, hh4weightcard)

* Low birth weight: below 2.5, and missing when no birth weight is available
* (the unit is presumably kilograms - confirm against the questionnaire)
generate bl_low_birthweight = (birthweight < 2.5) if birthweight != .

********************************************************************************
	
* Village level averages

* Village mean of each birth-level outcome, stored in m_<outcome>
foreach outcome in bl_facility_birth bl_low_birthweight {
	bysort village_id: egen m_`outcome' = mean(`outcome')
}

* The means are constant within a village, so contract leaves one row per
* village; the frequency counter it creates is not needed.
contract village_id m_bl_facility_birth m_bl_low_birthweight
drop _freq

label variable m_bl_low_birthweight "Baseline village-level average: Low birthweight"
label variable m_bl_facility_birth "Baseline village-level average: Birth at a facility"

* Append the new columns to the village-level file written earlier and
* overwrite that file.
merge 1:1 village_id using "baseline_outcomes_by_village.dta"
drop _merge

save "baseline_outcomes_by_village.dta", replace

********************************************************************************

* Weight for age

* Reload the birth-level file and rebuild the sample of most recent births
* for the anthropometric outcomes.
u "bl_HHBirth.dta", clear

replace VillageID = 1419004 if VillageID == 999903
merge m:1 VillageID using "sample_villages.dta"
drop if _merge == 1	
drop _merge
ren VillageID village_id

* Keep only most recent births
tostring village_id, g(VillageID_str)
tostring HHID, g(HHID_str)
tostring BirthID, g(BirthID_str)

drop if hh3birthyr < 2014

egen babyID = concat(VillageID_str HHID_str BirthID_str)
egen household_ID = concat(VillageID_str HHID_str)
duplicates tag household_ID, gen(dup_hhid)	

* A 2014 birth is dropped only when the same household also reports a later
* birth. The previous rule dropped every 2014 birth in a household with more
* than one remaining birth, so a household whose remaining births were all in
* 2014 lost all of them. egen with by() is used so that the sort order of the
* data is left untouched.
egen latest_birthyr = max(hh3birthyr), by(household_ID)
drop if hh3birthyr == 2014 & latest_birthyr > 2014
drop latest_birthyr

* Bring in the household-level variables; only births matched to a household
* record are kept.
* NOTE(review): this requires household_ID to exist in bl_HHData.dta with the
* same construction as above - confirm.
merge m:1 household_ID using "bl_HHData.dta"	
keep if _merge == 3
drop _merge 

* The counts in the comments below were recorded by the original author under
* the earlier most-recent-birth rule and may differ slightly now.
desc hh4weighnow* hh4lengthnow* hh3birthmonth hh3birthyr hh3bbsex hh3bbmulti
count if hh4weighnow1 != . & hh4weighnow2 != .	//2915 observations
tab hh4weighnow3	//282 cases with a third measurement
tab hh3bbmulti		//61 multiple births
	
	/*
	It is going to be impossible to determine which of the twins was born first. 
	Plus, only village level averages are needed for the analysis, and ~30 twins 
	won't sway averages much one way or another.  
	
	Hence, multiple births are included in these calculations.
	*/
	
	//Age of the child
	//Only the month and year of birth are used below, so the day of birth is
	//set to the 15th for every child.
	g hh3birthday = 15 
	order hh3birthday, after(hh3birthyr)
	
		*Date of birth variable
	g date_birth = mdy(hh3birthmonth, hh3birthday, hh3birthyr)
	format date_birth %td

		*Survey date variable (date part of the datetime hh9endtme1)
	g date_of_survey = dofc(hh9endtme1)
	format date_of_survey %td

		*Age of the infant, in whole months
		*Days are converted with 30.4375 days per month (365.25/12), the
		*conversion zanthro itself applies for ageunit(month). The previous
		*divisor of 30 overstated every age by about 1.5 percent.
	g days_between = date_of_survey-date_birth
	g age_in_months = round(days_between/30.4375) 
	drop days_between
	
	
	//Sex of the child
	ren hh3bbsex sex_of_infant
	
	//Combine the repeated weight readings into a single measure: the mean of
	//the pair of readings that agree most closely.
	//  - For each pair (1-2, 2-3, 1-3) compute the absolute gap and the pair mean.
	//  - rowmin and rowmean ignore missing readings. When fewer than two readings
	//    exist every gap is missing, the first comparison below holds
	//    (missing equals missing) and weight is the row mean of readings 1 and 2.
	//  - Pairs involving reading 3 are used only when reading 3 is present.
	//    On ties the later assignment wins.
	foreach pair in 12 23 13 {
		local a = substr("`pair'", 1, 1)
		local b = substr("`pair'", 2, 1)
		generate weight_diff_`pair' = abs(hh4weighnow`a' - hh4weighnow`b')
		egen weight_`pair' = rowmean(hh4weighnow`a' hh4weighnow`b')
	}
	egen least_weight_diff = rowmin(weight_diff_12 weight_diff_23 weight_diff_13) 
	generate weight = weight_12 if least_weight_diff == weight_diff_12
	replace weight = weight_23 if least_weight_diff == weight_diff_23 & hh4weighnow3 != .
	replace weight = weight_13 if least_weight_diff == weight_diff_13 & hh4weighnow3 != .
	drop *weight_diff* weight_*
	
	//Weight-for-age z-score against the WHO reference
	egen bl_waz = zanthro(weight, wa, WHO), xvar(age_in_months) gender(sex_of_infant) gencode(male=1, female=2) ageunit(month)
		/*
		Counts recorded by the original author:
		93 missing values generated
		86 are cases where the weight is missing.
		The remaining 7 are cases where the Z-score exceeds 5, and zanthro omits them
		*/
		
	//Underweight: z-score below -2, missing when the z-score is missing
	generate bl_underweight = (bl_waz < -2) if bl_waz != .

********************************************************************************
	
* Height-for-age

describe hh4lengthnow1 hh4lengthnow2 hh4lengthnow3
count if hh4lengthnow1 != . & hh4lengthnow2 != .	//2914 observations 
tabulate hh4lengthnow3	//260 cases with a third measurement

	/*
	It is going to be impossible to determine which of the twins was born first. 
	Plus, only village level averages are needed for the analysis, and ~30 twins 
	won't sway averages much one way or another.  
	
	Hence, multiple births are included in these calculations.
	*/
	
	//Combine the repeated length readings into a single measure: the mean of
	//the pair of readings that agree most closely.
	//  - For each pair (1-2, 2-3, 1-3) compute the absolute gap and the pair mean.
	//  - rowmin and rowmean ignore missing readings. When fewer than two readings
	//    exist every gap is missing, the first comparison below holds
	//    (missing equals missing) and height is the row mean of readings 1 and 2.
	//  - Pairs involving reading 3 are used only when reading 3 is present.
	//    On ties the later assignment wins.
	foreach pair in 12 23 13 {
		local a = substr("`pair'", 1, 1)
		local b = substr("`pair'", 2, 1)
		generate height_diff_`pair' = abs(hh4lengthnow`a' - hh4lengthnow`b')
		egen height_`pair' = rowmean(hh4lengthnow`a' hh4lengthnow`b')
	}
	egen least_height_diff = rowmin(height_diff_12 height_diff_23 height_diff_13) 
	generate height = height_12 if least_height_diff == height_diff_12
	replace height = height_23 if least_height_diff == height_diff_23 & hh4lengthnow3 != .
	replace height = height_13 if least_height_diff == height_diff_13 & hh4lengthnow3 != .
	drop *height_diff* height_*
	
	//Height-for-age z-score against the WHO reference
	egen bl_haz = zanthro(height, ha, WHO), xvar(age_in_months) gender(sex_of_infant) gencode(male=1, female=2) ageunit(month) nocutoff
		/*
		Counts recorded by the original author:
		59 missing values generated
		44 are cases where the height is missing.
		The remaining 15 were recorded as cases where the Z-score exceeds 5 and zanthro omits them.

		NOTE(review): this call passes the nocutoff option, which the weight-for-age
		call does not. With nocutoff, zanthro is documented to keep extreme z-scores
		instead of setting them to missing, so the 15 omissions above were presumably
		recorded before the option was added. Confirm which behaviour is intended:
		any extreme negative z-scores that are kept count as stunted below.
		*/
	
	//Stunting: z-score below -2, missing when the z-score is missing
	generate bl_stunted = (bl_haz < -2) if bl_haz != .

********************************************************************************
	
* Village level averages

* Village mean of each anthropometric outcome, stored in m_<outcome>
foreach outcome in bl_underweight bl_stunted {
	bysort village_id: egen m_`outcome' = mean(`outcome')
}

* The means are constant within a village, so contract leaves one row per
* village; the frequency counter it creates is not needed.
contract village_id m_bl_underweight m_bl_stunted
drop _freq

label variable m_bl_underweight "Baseline village-level average: Underweight"
label variable m_bl_stunted "Baseline village-level average: Stunted"

* Append the new columns to the village-level file and overwrite it
merge 1:1 village_id using "baseline_outcomes_by_village.dta"
drop _merge

save "baseline_outcomes_by_village.dta", replace

********************************************************************************
* BASELINE FACILITY QUALITY FOR SUBGROUP ANALYSIS
********************************************************************************

use "bl_FCMain.dta", clear	

* Twelve 0/1 facility indicators. Each is 1 when the source variable takes the
* listed code(s), 0 for any other non-missing value, and missing when the
* source variable is missing.

generate room_for_deliveries       = (fc2deliveryroom == 1)         if fc2deliveryroom != .
generate delivery_room_privacy     = inlist(fc2privacy, 1, 2, 3)    if fc2privacy != .
generate delivery_room_bed_clean   = (fc2cleanbed == 1)             if fc2cleanbed != .
generate delivery_room_floor_clean = (fc2floor == 1)                if fc2floor != .
generate delivery_room_ventilated  = (fc2ventilation == 1)          if fc2ventilation != .
generate delivery_room_waste       = (fc2medwaste == 1)             if fc2medwaste != .

* Code 2 of fc2dust is the favourable answer here (no dust/mold observed,
* according to the list in the file header)
generate delivery_room_dust        = (fc2dust == 2)                 if fc2dust != .

generate placenta_bin              = inlist(fc2placentahand, 3, 4)  if fc2placentahand != .
generate toilet_water_flush        = (fc2toiletflush == 1)          if fc2toiletflush != .
generate toilet_water_handwash     = (fc2toiletwash == 1)           if fc2toiletwash != .
generate toilet_soap               = (fc2toiletsoap == 1)           if fc2toiletsoap != .

* NOTE(review): the file header lists this twelfth item as "MNH staff present
* 24*7", while the variable is named after mobile-phone contact and is built
* from fc3howcontact == 1 - confirm the coding against the questionnaire.
generate mnh_staff_mobilephone     = (fc3howcontact == 1)           if fc3howcontact != .

***Measures of facility quality*** 

* capacity_a counts the indicators equal to 1. Missing indicators add nothing,
* and the score is missing only when all twelve indicators are missing.
local quality_items room_for_deliveries delivery_room_privacy delivery_room_bed_clean delivery_room_floor_clean ///
	delivery_room_ventilated delivery_room_waste delivery_room_dust placenta_bin toilet_water_flush toilet_water_handwash ///
	toilet_soap mnh_staff_mobilephone
egen capacity_a = rowtotal(`quality_items'), missing

* Keep only facilities that also appear in the facility sample file
rename HealthFacilityID health_facility_id
merge 1:1 health_facility_id using "sample_facility.dta"
keep if _merge == 3
drop _merge

keep health_facility_id capacity_a	
save "Baseline_facility_quality.dta", replace

********************************************************************************

clear
